

### Project: IADB Government Payroll Analytics - Country
### Project leader: Dr Christian Schuster
### Code author (s): Robert Lipiński
### Date last update: (run below)
# Show the last-modified time (mtime) of the file returned by RStudio's active
# document context; relies on rstudioapi, so it only works inside RStudio.
file.info(rstudioapi::getActiveDocumentContext()$path)$mtime

### Script purpose: cleans key columns (gender, organization-level variables, region, pay variables, basic rank)


### Execution time: ~20 minutes

### Inputs: 
# 1) /data/intermediate/country_03_limpiar_conjunto.[format1]



### Outputs:
# 1) /data/intermediate/country_04_limpiar_cols.[format1]
# 2) /data/clean/dictionaries/genero_comprimido.csv [only created if run anew; otherwise the pre-created file serves as an input]


# *) /data/raw_qs/intermediate_temp/country_04_limpiar_cols (temp0).[format1] [temporary file only, to avoid re-running the full script in case of an error;
# not necessary for executing the script]
# *) /data/raw_qs/intermediate_temp/country_04_limpiar_cols (temp1).[format1] [temporary file only, to avoid re-running the full script in case of an error;
# not necessary for executing the script]



#
# SET-UP --------------------------------------------------------------------------------------------
#

# Start from an empty workspace
rm(list = ls())

### Source the '00_country_global.R' script (expected in the same folder as this
### script) with required packages and functions
source(file.path(dirname(rstudioapi::getActiveDocumentContext()$path), '00_country_global.R'))


# library(installr)
# updateR()

# Make a copy of the file: same path, with 'code' replaced by 'code/00_ARCHIVE'
# and '.R' replaced by ' - copy.R'
editor_path  <- rstudioapi::getSourceEditorContext()$path
archive_path <- gsub('code', 'code/00_ARCHIVE', gsub('\\.R', ' - copy.R', editor_path))
file.copy(from = editor_path, to = archive_path, overwrite = TRUE, copy.date = TRUE)

# '  ------------------------------------------------------------------------------------------------------------------------------------------------------
# READ DATA ------------------------------------------------------------------------------------------------------------------------------------------------
#

t0 <- Sys.time() # record start time

### Work out which columns to load: only columns whose names appear somewhere in
### the text of this script are read, plus a few that are always added.
### NOTE: the match is a regex search of each column name against the lower-cased
### script lines (comments included), so editing comments or commented-out code
### can change which columns get loaded.

## column names available in the input file
col_names <- names(open_dataset(file.path(main_dir, 'data', 'intermediate', "country_03_limpiar_conjunto.parquet")))


## read and lower-case the script once (instead of once per column name)
script_lines <- tolower(readLines(rstudioapi::getActiveDocumentContext()$path))

## keep the column names mentioned in the script; vapply() always returns a
## logical vector, also when 'col_names' is empty
col_in_script <- vapply(col_names, function(col_name) any(grepl(col_name, script_lines)), logical(1))
col_select1 <- col_names[col_in_script]
col_select1 <- unique(c('row_id_org', col_select1, 'tipo_cargo', 'tipo_calificacion'))


# if(!is_empty(setdiff(col_select1, col_names))){stop('READING FILE ERROR: Some of the selected column names are not present in the datafile')}


### read the file (selected columns only)
country <- read_flex(file.path(main_dir, 'data', 'intermediate', "country_03_limpiar_conjunto"), format = 'parquet', col_select = col_select1)
dim(country)




### *checks -> compare same qs and parqet reading times 
# t0=Sys.time()
# country = read_flex(file.path(main_dir, 'data', 'intermediate', "country_03_limpiar_conjunto"), format = format1)
# t1=Sys.time()
# temp = read_flex(file.path(main_dir, 'data', 'intermediate', "country_03_limpiar_conjunto"), format = 'parquet', col_select = col_select1)
# t2=Sys.time()

### NOTE: At this stage it appears that .parquet with select columns is ~2.5x faster than .qs (!). The price is a file size that is ~80% larger.
### .rds is in-between - larger and faster than .qs, smaller and slower than .parquet



## convert to data.table (by reference) unless it already is one
if (!inherits(country, 'data.table')) {
  setDT(country)
}
gc()

beep() # signal once completed



### (*) checks > plot - distribution of observations by month
dim(country) # dimensions

# number of rows per month and dataset
temp <- country[, .(n = .N), by = .(anyo_mes, dataset)]

# area chart of the monthly counts, one panel per dataset
g1 <- ggplot(as.data.frame(temp), aes(x = anyo_mes, y = n)) +
  # geom_bar(stat = 'identity', position = position_dodge(.9),
  #          fill = '#009da7')+
  geom_area(fill = '#009da7') +
  facet_wrap(~dataset) +
  scale_x_date(
    expand = expansion(mult = c(0.05, .05)),
    date_breaks = "1 year", # one axis break per year
    date_labels = "%Y"      # label breaks with the 4-digit year
  ) +
  labs(x = 'Mes', y = 'Numero de observaciones') +
  scale_y_continuous(
    labels = comma,
    limits = c(0, NA),
    expand = expansion(mult = c(0, .2))
  ) +
  theme(
    axis.text.x = element_markdown(size = 15),
    axis.text.y = element_markdown(size = 15)
  )

g1

# the path is relative, so the figure lands in 'figures/' under the working directory
ggsave(
  filename = file.path('figures', 'distribution of payments by month (nobs).png'),
  plot = g1,
  width = 40, height = 20, units = 'cm'
)

# time elapsed since the start of the read step
Sys.time() - t0

#
# '  -----------------------------------------------------------------------------------------------------
# GENDER -----------------------------------------------------------------------------------------------------------------
#

### add ID based on names [should be moved to 03 script by now]

# # faster than cur_group_id or group_indices (works, but is depreciated) and results in the same number of IDs
# country[, id := .GRP, by = .(nombres, paterno, materno)]
# 
# ### >(temp)save [should be moved to 03 script by now]
# 
# write_flex(x =  unique(country[, .(id, nombres, paterno, materno)]),
#            file.path(main_dir, 'data',  'intermediate_temp', "country_04_limpiar_cols (temp0)"), format = format1)


### clean gender assignment -------------------------------------------------------------------------------------------------------------------------------
### NOTE: Only run R script for that if anew specified as TRUE or if a file with cleaned (comprimido) gender names not already present
### The gender dictionary is rebuilt by '045_country_genero_clean.R' only when
### 'anew' is TRUE or when the cleaned file is not on disk yet; otherwise the
### existing file is read as is.
anew <- FALSE

if (!anew &&
    file.exists(file.path(main_dir, 'data', 'clean', 'dictionaries', paste0('genero_comprimido', '.', format1)))) {

  # not anew AND file exists -> just read it
  print('Reading file with clean gender names')
  genero_comprimido <- read_flex(file.path(main_dir, 'data', 'clean', 'dictionaries', 'genero_comprimido'), format = format1)

} else {

  # anew OR dictionary file missing -> run the cleaning script, then read its output.
  # (Previously this branch was conditioned on the *script* being absent, so a
  # missing dictionary with anew = FALSE left 'genero_comprimido' undefined.)
  print('Creating and reading file with clean gender names')
  # write_parquet(country, file.path(main_dir, 'Data', 'Raw', 'country_example_genero.parquet'))

  source(file.path(main_dir, 'code', '045_country_genero_clean.R'))
  genero_comprimido <- read_flex(file.path(main_dir, 'data', 'clean', 'dictionaries', 'genero_comprimido'), format = format1)

}


### add to full country data.frame
if (!inherits(genero_comprimido, 'data.table')) {
  setDT(genero_comprimido)
}

# keep only the join key, the name columns and the gender (renamed to 'genero')
genero_comprimido <- genero_comprimido %>% rename(genero = genero_comprimido)
genero_comprimido <- genero_comprimido[, .(id, name_full_original, name_full, genero)]

gc()
# keeps every row of 'country' and adds the dictionary columns matched on 'id'
country <- genero_comprimido[country, on = "id"]
gc()


### gender tecnico gap (women should be ~40-44% of directivos)
# country[,uniqueN(id[tipo_estamento == "tecnico" &  genero == 'mujer'])/
#         uniqueN(id[tipo_estamento == "tecnico"]),
#       by = .(anyo)]
# 
# pr_top(country[tipo_estamento=='tecnico'], 'tipo_cargo', 30)


# country[, fmean(pago_bruto), by = .(, organismo_nombre)]

# country[1:100, c('name_full', 'genero')]
# table(country$tipo_estamento, country$genero, useNA='ifany')
# 
# temp <- country[, .(N = unique(genero)), by = name_full]
# temp$N %>% summary



### >(temp)save -----------------------------------------------------------------------------------------
# write_flex(x = country, file.path(main_dir, 'data',  'intermediate_temp', "country_04_limpiar_cols (temp1)"), format = format1)


# ' ------------------------------------------------------------------------------------------------------------
# ORGANIZATIONS ------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#

# country = read_flex(file.path(main_dir, 'data', 'intermediate_temp', "country_04_limpiar_cols (temp1)"), format = format1)
# if(!any(grepl('data.table', class(country)))){setDT(country)}
# gc()



### XX_country_organismo_clean.R is run to create my best raw approximation of organizational classification, but
### that file was then also cleaned by Christian Schuster and the Countryan counterpart, resulting in a clean version that can 
### be just read directly
# Organization dictionary, used here to check code-name pairs
org_names <- openxlsx::read.xlsx(file.path(main_dir, 'data', 'clean', 'additional_gov_data', 'gov_country_organismo_comprimido.xlsx'))
setDT(org_names)

### clean codes-names pairs (as needed) ---------------------------------------------------------------------------------------------------------
### checks >  how many? 1 name matches 1 code?

# 1 duplicate (ab056) found by below checking code -> simple typo - one is '... del ranco', the other '...de ranco' (cor)
# In the dictionary: harmonize the name, then drop the resulting duplicate row.
org_names[organismo_nombre == "delegacion presidencial provincial del ranco",
          organismo_nombre := "delegacion presidencial provincial de ranco"]
org_names <- unique(org_names)

# In the payroll data: harmonize the name in place. The previous version wrapped
# this in unique(), which de-duplicated the full payroll table (expensive, and it
# silently drops rows) just to recode one string.
# NOTE(review): assumes 'row_id_org' makes every payroll row distinct, so dropping
# the unique() leaves the row count unchanged - confirm with the nrow / row_id_org
# check at the end of the script.
country[organismo_nombre == "delegacion presidencial provincial del ranco",
        organismo_nombre := "delegacion presidencial provincial de ranco"]


# serviu region de los rios has two codes, ap010 being 4:1 more prevalent - standardize
country[organismo_codigo == 'ap015', organismo_codigo := 'ap010']



### organismo names + nivel  ---------------------------------------------------------------------------------------------------------------------------------

#### add cleaned organismo columns based on the file created in the 'XX_country_organismo_clean.R' script
#### the original file has been amended by the government ('gov_' prefix) so read this if available
### Source of the clean organization names, in order of preference:
###   1) government-amended file ('gov_' prefix), 2) our own cleaned file,
###   3) re-create our own file by running 'XX_country_organismo_clean.R'.
### Setting anew = TRUE forces option 3.
anew <- FALSE

org_file_gov <- file.path(main_dir, 'data', 'clean', 'additional_gov_data', 'gov_country_organismo_comprimido.xlsx')
org_file_own <- file.path(main_dir, 'data', 'clean', 'additional_gov_data', 'country_organismo_comprimido.xlsx')

if (!anew && file.exists(org_file_gov)) { # not anew AND government-cleaned file exists

  print('Reading file with clean organization names (government version)')
  organismo_comprimido <- openxlsx::read.xlsx(org_file_gov)

} else if (!anew && file.exists(org_file_own)) { # not anew AND our own cleaned file exists
  # (this branch used to repeat the 'gov_' condition above and was unreachable)

  print('Reading file with clean organization names')
  organismo_comprimido <- openxlsx::read.xlsx(org_file_own)

} else { # anew OR no cleaned file available -> create it, then read it

  print('Creating and reading file with clean organization names')
  source(file.path(main_dir, 'code', 'XX_country_organismo_clean.R'))
  # path spelled out again rather than reusing a variable set before source()
  organismo_comprimido <- openxlsx::read.xlsx(file.path(main_dir, 'data', 'clean', 'additional_gov_data', 'country_organismo_comprimido.xlsx'))

}

### combine with the dataframe
setDT(organismo_comprimido)

organismo_comprimido[, organismo_nombre_clean := clean_text(organismo_nombre)] # ensure there is no diacritics
organismo_comprimido[, organismo_nivel := str_to_sentence(organismo_nivel)]
organismo_comprimido <- organismo_comprimido[, .(organismo_codigo, organismo_nombre_clean,
                                                 organismo_sector_comprimido, organismo_nivel)]

# A code listed more than once in the lookup would multiply the matching payroll
# rows in the join below, so keep the first entry per code and report the rest.
dup_codes <- unique(organismo_comprimido$organismo_codigo[duplicated(organismo_comprimido$organismo_codigo)])
if (length(dup_codes) > 0) {
  warning('Duplicated organismo_codigo in the lookup (first entry kept): ',
          paste(dup_codes, collapse = ', '))
  organismo_comprimido <- unique(organismo_comprimido, by = 'organismo_codigo')
}

# keeps every row of 'country' and adds the lookup columns matched on the code
country <- organismo_comprimido[country, on = "organismo_codigo"]
gc()


### any missing clean names? which ones?
# unique organization rows, to count how many lack a clean name
country_org <- unique(country[, .(organismo_codigo, organismo_nombre, organismo_nombre_clean, organismo_nivel)])
sf(is.na(country_org$organismo_nombre_clean))

### 13 missing names (but codes unique throughout) -> replace missing clean names with raw version
country[is.na(organismo_nombre_clean), organismo_nombre_clean := organismo_nombre]

## also add appropriate nivel for them (needs to be done manually)
# first central-level orgs
country[organismo_nombre_clean %in% c('servicio de biodiversidad y areas protegidas',
                                      'instituto forestal (infor)'),
        organismo_nivel := 'Central']

funique(country$organismo_nombre_clean[is.na(country$organismo_nivel)]) # check if any non-local orgs still left without nivel

# assign 'Local' to all others, i.e. ONLY to rows still without a nivel.
# (The condition used to be !is.na(), which overwrote every already-known
# nivel - including 'Central' - and left the missing ones untouched.)
country[is.na(organismo_nivel), organismo_nivel := 'Local']


gc()

### (*) checks -> total annual budget - agree with external sources?
# sum of gross pay per year
country[, .(value = fsum(pago_bruto)), by = "anyo"]

# 0.21 * 64768069 # 21% of the 2023 budget according to La Tercera 
# 0.21 * 76727518 # 21% of the 2023 budget according to the BCN 

# (*) checks > gender gap
# ratio of mean male to mean female pay per year: gross pay first, then net pay
country[, {
  pago_hombre <- mean_miss(pago_bruto[genero == 'hombre'])
  pago_mujer  <- mean_miss(pago_bruto[genero == 'mujer'])
  .(value = pago_hombre / pago_mujer)
}, by = "anyo"]
country[, {
  pago_hombre <- mean_miss(pago_liquido[genero == 'hombre'])
  pago_mujer  <- mean_miss(pago_liquido[genero == 'mujer'])
  .(value = pago_hombre / pago_mujer)
}, by = "anyo"]

# tapply(country$pago_bruto, paste(country$anyo, country$organismo_anos), sum_miss)
# 1.317635e+12 + 1.482452e+13 # ✓ if one adds pago from both organismos to keep and to remove in 2023, then the number is virtually identical

#
# '  -----------------------------------------------------------------------------------------------------
# REGION -----------------------------------------------------------------------------------------------------
# 

### NOTE: Generally, region is an extremely clean column, but in a sample of 68 million, several hundred
### entries are off, mostly listing >1 region name in a string. Given how few observations that is
### and that they are difficult to reclassify anyway, let's just set those to NA


# region values that occur in fewer than 100 rows are treated as dirty -> NA
rare_regions <- country[, .N, by = region][N < 100, region]
country[region %in% rare_regions, region := NA]
# country = country %>% group_by(region) %>% 
#   mutate(region = ifelse(n() < 100, NA, region)) %>% ungroup()

### checks -> how many of each + no. unique (non-missing)
country$region %>% pr_isna()
country$region %>% table %>% sort

### stop the code if there are not exactly 16 non-missing regions in the data
n_regions <- fdistinct(na.omit(funique(country$region)))
if (n_regions != 16) {
  stop('Error: Number of (non-missing) region names different than 16')
}




#
# '  ---------------------------------------------------------------------------------------------------------------------------------------------
# SALARY ----------------------------------------------------------------------------------------------------------------------------
#

### NOTE: if we were to run the code for gender wage gap (below) on un-winsorized pago_bruto, we would get an
### odd value of 1.04 for 2019, while ~1.20 (as expected) for other years.
### This is because there are a few EXTREMELY HIGH values (the max of the winsorized values is 58M, while unwinsorized values go up to
### 146,684M = 146 BILLION! In 2019 there are 8 women assigned to >100 billion).
### BASELINE: WINSORIZING IS STRICTLY NECESSARY!!!
# country[, fmean(pago_bruto[genero == 'hombre'])/fmean(pago_bruto[genero == 'mujer']), by = .(anyo)]

# 146684747524/1e6
# temp = country[country$pago_bruto > 100e9,]
# tapply(temp$genero, temp$anyo, sf)
# sf(temp$genero)
# sf(country$pago_bruto > 100242260)
# sf(country$pago_bruto_win > 100242260)


### + winsorize ---------------------------------------------------------------------------------------------------------------------------
# quantile bounds: no trimming at the bottom, cap at the 99.999th percentile
min1 <- 0
max1 <- 99.999 / 100 # easier to express than fractions

# country = country %>% mutate(across(starts_with('pago'), ~Winsorize(., val = quantile(., probs = c(min1, max1), na.rm=T))))

# every column whose name starts with 'pago'
pago_cols <- grep("^pago", names(country), value = TRUE)

# winsorize each pay column, overwriting the original values
for (pago_col in pago_cols) {
  print(pago_col)
  pago_limits <- quantile(country[[pago_col]], probs = c(min1, max1), na.rm = TRUE)
  set(country, j = pago_col, value = Winsorize(country[[pago_col]], val = pago_limits))
}



### + total salary---------------------------------------------------------------------------------


# total pay (all variables apart from pago_bruto and, if they exists, totals and inflation_adjusted values)

if (!inherits(country, 'data.table')) {
  setDT(country)
}

# pago_cols <- grep("^pago", names(country), value = TRUE)
# pago_cols <- setdiff(pago_cols, grep("(_total$|_inf$)|^pago_liquido$", pago_cols, value = TRUE))
# country[, pago_total := rowSums(.SD, na.rm = TRUE), .SDcols = pago_cols]

# overtime = the three hourly pay columns; extra = other add-ons plus overtime
# (missing components are ignored in the row sums)
overtime_cols <- c("pago_horas_diurnas", "pago_horas_nocturnas", "pago_horas_festivas")
extra_cols    <- c('pago_adicional', 'pago_incentivos', 'pago_viatico', overtime_cols)

country[, pago_overtime_total := rowSums(.SD, na.rm = TRUE), .SDcols = overtime_cols]
country[, pago_extra_total := rowSums(.SD, na.rm = TRUE), .SDcols = extra_cols]



# country <- country %>%
#   mutate(
#     pago_total = rowSums(across(c(matches('^pago'), -matches('_total$'), -matches('_inf$'), -pago_liquido), ~ replace_na(.x, 0))),
#     pago_overtime_total = rowSums(across(c(pago_horas_diurnas, pago_horas_nocturnas, pago_horas_festivas), ~ replace_na(.x, 0))),
#   )


## checks > NA's on pago_total need to be lower than on pago_bruto/liquida (depending on which one used)
# NOTE(review): 'pago_total' is only created in the commented-out code above, so
# the first check is meaningful only if that column comes with the input data.
pr_na(country$pago_total <= 0)
pr_na(country$pago_bruto <= 0)
pr_na(country$pago_liquido <= 0)

# (*) checks > gender gap
# ratio of mean male to mean female pay per year: gross pay first, then net pay
country[, {
  pago_hombre <- mean_miss(pago_bruto[genero == 'hombre'])
  pago_mujer  <- mean_miss(pago_bruto[genero == 'mujer'])
  .(value = pago_hombre / pago_mujer)
}, by = "anyo"]
country[, {
  pago_hombre <- mean_miss(pago_liquido[genero == 'hombre'])
  pago_mujer  <- mean_miss(pago_liquido[genero == 'mujer'])
  .(value = pago_hombre / pago_mujer)
}, by = "anyo"]



### + bandos salarial --------------------------------------------------------------------------------------------------------------------------------
## note: needs to be done after having final set of observations as this might affect the calculations
## of the quantiles

## break points: 0, then the 20/40/60/80/100th percentiles of pago_bruto
pago_bands1 <- c(0, quantile(country$pago_bruto, probs = c(0.2, .4, .6, .8,
                                                          #.95, .99,
                                                          1), na.rm = TRUE))
options(scipen = 999) # disable scientific notation (NOTE: not restored later in this script)

### define band labels in the format "band_[min]_[max]"
# pago_bands_labels1 <- paste0("band_", head(pago_bands1, -1), "_", pago_bands1[-1])

### define pay bands descriptively (in Spanish)
pago_bands_labels1 <- c('primero quintil salarial (el más bajo)',
                        'segundo quintil salarial',
                        'tercer quintil salarial',
                        'cuarto quintil salarial',
                        'quinto quintil salarial (el más alto)')

### create bands (any pre-existing 'banda_salarial' columns are dropped first)
country <- country %>%
  select(-c(matches('banda_salarial'))) %>%
  mutate(banda_salarial = cut(
    pago_bruto,
    breaks = pago_bands1, # use quantile values as breaks
    labels = pago_bands_labels1, # label each quantile group
    right = FALSE, # using F results in more equally distributed quintiles, even though, due to clustering at certain pay scales, they are still unequal in size
    # with right = FALSE the intervals are [a, b), so values equal to the top
    # break (the maximum pago_bruto) would fall outside every band and get NA;
    # include.lowest = TRUE closes the last interval so they land in the top quintile
    include.lowest = TRUE
  ),
  # banda_salarial2 = ntile(pago, 5) # cuts into perfectly sized bins, but at the cost of putting some people getting the same pay into different bins
  )

# checks > results ok? R: should be ~20% each (unlike in Country quintiles are close to 20% each as there are 
# fewer widely applicable salary bands)
pr(country$banda_salarial)
# pr(country$banda_salarial2)

### checks -> do 'pago_bruto' values match the desired quantile ranges?
tapply(country$pago_bruto, country$banda_salarial, summary)


dim(country)
beep()

#
# '  -----------------------------------------------------------------------------------------------------
# RANK -----------------------------------------------------------------------------------------------------------------
#

### checks > rank missing only for codigo/honorarios (see Christian's email from 30/04/2025) -> TRUE
table(country$tipo_estamento, country$dataset, useNA = 'ifany')


### read file with unified (compressed/comprimido) ranks (done manually by Christian and Robert);
### text columns are trimmed, lower-cased and passed through clean_text(), and rows without a raw rank are dropped
rank_file <- file.path(main_dir, 'data', 'clean', 'dictionaries', 'country_rank_comprimido.xlsx')
rank_comprimido <- openxlsx::read.xlsx(rank_file)
rank_comprimido <- mutate(rank_comprimido,
                          across(where(is.character), function(x) clean_text(tolower(str_trim(x)))))
rank_comprimido <- filter(rank_comprimido, !is.na(tipo_estamento))

# make any changes as necessary:
# health staff and teachers are folded into 'profesional'; 'alcalde' is set to missing
rank_comprimido <- mutate(
  rank_comprimido,
  tipo_estamento_comprimido = case_when(
    tipo_estamento_comprimido %in% c('medicos / personal de salud', 'docente') ~ 'profesional',
    # tipo_estamento_comprimido %in% c('alcalde') ~ 'directivo',
    tipo_estamento_comprimido %in% c('alcalde') ~ NA,
    .default = tipo_estamento_comprimido
  )
)

# keep only the raw -> compressed mapping and convert to data.table
rank_comprimido <- dplyr::select(rank_comprimido, tipo_estamento, tipo_estamento_comprimido)
setDT(rank_comprimido)





### over-write this one, as otherwise it matches wrongly (only 2 entries?)
# funique(rank_comprimido$tipo_estamento)[!funique(rank_comprimido$tipo_estamento) %in% country$tipo_estamento] # to check the above
# recode the raw label so that it matches the dictionary entry
es_presidente <- which(country$tipo_estamento == '(1) presidente del consejo directivo')
country$tipo_estamento <- replace(country$tipo_estamento, es_presidente, 'presidente(a) del consejo directivo')

### combine (keeps every row of 'country', adds the compressed rank)
country <- rank_comprimido[country, on = 'tipo_estamento']


### if rank missing = 'estamento no definido'
country[is.na(tipo_estamento_comprimido), tipo_estamento_comprimido := 'estamento no definido']


### checks > raw and clean estamento tabulated
table(country$tipo_estamento, country$tipo_estamento_comprimido, useNA = 'ifany')



# (*) checks > gender gap directivos numbers
# share of women among distinct 'directivo' names, per year
country[, {
  es_directivo <- tipo_estamento_comprimido == "directivo"
  uniqueN(name_full_original[es_directivo & genero == 'mujer']) /
    uniqueN(name_full_original[es_directivo])
}, by = .(anyo)]



# checks > gender pay gap by rank
# ratio of mean male to mean female gross pay per year: 'directivo', then 'tecnico'
country[, {
  en_rango <- tipo_estamento_comprimido == 'directivo'
  fmean(pago_bruto[genero == 'hombre' & en_rango]) /
    fmean(pago_bruto[genero == 'mujer' & en_rango])
}, by = .(anyo)]

country[, {
  en_rango <- tipo_estamento_comprimido == 'tecnico'
  fmean(pago_bruto[genero == 'hombre' & en_rango]) /
    fmean(pago_bruto[genero == 'mujer' & en_rango])
}, by = .(anyo)]


# no. of rows and row IDs
nrow(country)
fdistinct(country$row_id_org)

### > final save -----------------------------------------------------------------------------------------
gc()
output_path <- file.path(main_dir, 'data', 'intermediate', "country_04_limpiar_cols")
write_flex(x = country, output_path, format = format1)


beep()
exec_time_fun('exec_time')




#
# FIN DEL CÓDIGO  --------------------------------------------------------------------------------------------
# 